Input




Preprocessing

Create gene-article matrix

### GET LIST OF GENES
list_gene_files = list.files('./../SFARI_scraping/genes/')


# GET LIST OF PUBMED IDS
get_pubmed_IDs = function(){
  all_pubmed_ids = c()
  for(gene_file in list_gene_files){
    file = read.delim(paste0('./../SFARI_scraping/genes/', gene_file))
    pubmed_ids = unique(file$pubmed.ID)
    all_pubmed_ids = unique(c(all_pubmed_ids, pubmed_ids)) 
  }
  all_pubmed_ids = as.character(all_pubmed_ids)
  return(all_pubmed_ids)
}

all_pubmed_ids = get_pubmed_IDs()


# CREATE GENE-ARTICLE (SPARSE) MATRIX
SFARI_genes_info = read.csv('./../../Data/SFARI_genes_01-15-2019.csv') %>% mutate('ID'=gene.symbol)

create_gene_pmID_mat = function(){
  
  # Create empty dataframe
  gene_pmID_mat = data.frame(matrix(0, nrow=length(list_gene_files), ncol=length(all_pubmed_ids)))
  rownames(gene_pmID_mat) = sapply(list_gene_files, function(x) gsub('.csv', '', x))
  colnames(gene_pmID_mat) = all_pubmed_ids
  
  # Fillout dataframe
  for(gene_file in list_gene_files){
    file = read.delim(paste0('./../SFARI_scraping/genes/', gene_file))
    pubmed_ids = as.character(unique(file$pubmed.ID))
    gene_pmID_mat[gsub('.csv', '', gene_file), pubmed_ids] = 1 
  }
  
  sparse_gene_pmID_mat = Matrix(as.matrix(gene_pmID_mat), sparse=TRUE)
  return(sparse_gene_pmID_mat)
}

gene_pmID_mat = create_gene_pmID_mat()

print(paste0('Genes: ', nrow(gene_pmID_mat), '     Articles: ', ncol(gene_pmID_mat)))
## [1] "Genes: 1053     Articles: 3709"
rm(list_gene_files, all_pubmed_ids)

Create article-MeSH term module membership matrix

article_module_mat = read.csv('./../Data/article_module_matrix_wo_qualifiers.csv')
rownames(article_module_mat) = article_module_mat$X
article_module_mat = Matrix(as.matrix(article_module_mat[,-1]), sparse=TRUE)

print(paste0('Articles: ', nrow(article_module_mat), '     Modules: ', ncol(article_module_mat)))
## [1] "Articles: 3345     Modules: 15"

Create gene-module membership matrix

gene_pmID_mat = gene_pmID_mat[,colnames(gene_pmID_mat) %in% rownames(article_module_mat)]

# Check that ordering is the same in both matrices
if(!all(colnames(gene_pmID_mat) == rownames(article_module_mat))){
  print("Article ordering doesn't match")
}

gene_module_mat = gene_pmID_mat %*% article_module_mat

print(paste0('Genes: ', nrow(gene_module_mat), '     Modules: ', ncol(gene_module_mat)))
## [1] "Genes: 1053     Modules: 15"

Remove genes without any associated modules (no MeSH terms)

to_keep = rowSums(gene_module_mat)>0
print(paste0('Removing ', sum(!to_keep), ' genes'))
## [1] "Removing 14 genes"

SFARI score distribution of removed genes:

SFARI_genes_info$gene.score[!SFARI_genes_info$ID %in% rownames(gene_module_mat)[to_keep]] %>% table(useNA='ifany')
## .
## 3 4 5 
## 3 6 5
# Remove genes
gene_module_mat = gene_module_mat[to_keep,]
rm(to_keep)

# Remove functions to preprocess data
rm(create_gene_pmID_mat, get_pubmed_IDs)

Analysis

Module 7 is the most related module by far

genes_by_module = data.frame('ID'=colnames(gene_module_mat), 'n_genes'=colSums(gene_module_mat))

ggplotly(genes_by_module %>% ggplot(aes(ID, n_genes, fill=sqrt(n_genes))) + geom_bar(stat='identity') + scale_y_sqrt() +
         ggtitle('No. genes related to each module') + xlab('Modules') + ylab('Related genes') + 
         theme_minimal() + theme(legend.position = 'none'))
rm(genes_by_module)

The higher the SFARI score, the bigger the module membership (makes sense because they have more papers and more MeSH terms). The same relation exists between syndromic and non-syndromic genes.

mesh_terms_by_gene = data.frame('ID'=rownames(gene_module_mat), 'n_Modules'=rowSums(gene_module_mat)) %>% 
                     left_join(SFARI_genes_info, by='ID') %>% dplyr::select(ID, n_Modules, gene.score, syndromic) %>%
                     mutate(gene.score = as.factor(gene.score), syndromic=as.factor(as.logical(syndromic)))

p1 = ggplotly(mesh_terms_by_gene %>% ggplot(aes(gene.score, n_Modules, fill=gene.score)) + geom_boxplot() + 
         xlab('Gene Score') + ylab('No. Modules') + theme_minimal() + theme(legend.position='none') +
         scale_fill_manual(values=SFARI_colour_hue()) + scale_color_manual(values=SFARI_colour_hue()))

p2 = ggplotly(mesh_terms_by_gene %>% ggplot(aes(syndromic, n_Modules, fill=syndromic)) + geom_boxplot() + 
         ggtitle('Module membership relation to each gene by score (left) and syndromic tag (right)') + 
         xlab('Syndromic') + ylab('No. Modules') + theme_minimal() + theme(legend.position='none'))

subplot(p1, p2, nrows=1, widths=c(0.7,0.3))
rm(p1, p2)

Since scores 5 and 6 have no syndromic genes, this tag can be a confounder, but even when separating the genes both by score and syndromic tag, the relation persists

ggplotly(mesh_terms_by_gene %>% ggplot(aes(gene.score, n_Modules, fill=gene.score)) + geom_boxplot() + 
         ggtitle('Modules membership relation to each gene by score and syndromic tag') + 
         xlab('Gene Score') + ylab('Module Memberships') + facet_wrap(~syndromic) +    
         scale_fill_manual(values=SFARI_colour_hue()) + scale_color_manual(values=SFARI_colour_hue()) +
         theme_minimal() + theme(legend.position='none'))
rm(mesh_terms_by_gene)

Gene Network Analysis

Create Network

On average, each node is connected to half of the nodes -> strongly connected Network

# EDGES
gene_gene_mat = gene_module_mat %*% t(gene_module_mat)
gene_names = rownames(gene_module_mat)

edges = summary(gene_gene_mat) %>% rename('from'=i, 'to'=j, 'count'=x) %>% 
        mutate(from = gene_names[from], to = gene_names[to]) %>%
        filter(from!=to)

# Convert count to fraction (multiplying by 100 to make the numbers bigger)
gene_module_mat_rowsums = rowSums(gene_module_mat)
names(gene_module_mat_rowsums) = rownames(gene_gene_mat)
edges = edges %>% mutate('weight'=count/(gene_module_mat_rowsums[from]+gene_module_mat_rowsums[to]))

# NODES
nodes = SFARI_genes_info %>% dplyr::select(gene.symbol, gene.score, syndromic, number.of.reports) %>% 
        mutate(id = gene.symbol, title=gene.symbol, value=number.of.reports,
               shape=ifelse(syndromic==0, 'circle','square'),
               color_score=case_when(gene.score==1~'#FF7631', gene.score==2~'#FFB100',
                                     gene.score==3~'#E8E328', gene.score==4~'#8CC83F',
                                     gene.score==5~'#62CCA6', gene.score==6~'#59B9C9',
                                     is.na(gene.score) ~ '#b3b3b3')) %>%
        mutate('color'=color_score) %>% filter(!duplicated(id)) %>% 
        filter(gene.symbol %in% unique(c(edges$from, edges$to)))

# Details:
print(paste0('Nodes: ', nrow(nodes),'        Edges: ', nrow(edges)/2))
## [1] "Nodes: 1039        Edges: 538919"
rm(gene_module_mat_rowsums, gene_names)

Calculate eigencentrality by Gene Score

The higher the SFARI score, the higher the eigencentrality of the genes in the Network. Same with syndromic tag

graph = graph_from_data_frame(edges, vertices=nodes, directed=FALSE)
graph = graph %>% simplify(edge.attr.comb='first')

eigen_centrality_output = eigen_centrality(graph)
eigencentr = as.numeric(eigen_centrality_output$vector)
graph = graph %>% set_vertex_attr('eigencentrality', value=eigencentr)

plot_data = data.frame('gene.score'= as.factor(vertex_attr(graph, 'gene.score')),
                       'Syndromic'= as.logical(vertex_attr(graph, 'syndromic')),
                       'Eigencentrality'= vertex_attr(graph, 'eigencentrality'))

correlation_scr = cor(nodes$gene.score[!is.na(nodes$gene.score)], eigencentr[!is.na(nodes$gene.score)])
correlation_syn = cor(as.numeric(plot_data$Syndromic), plot_data$Eigencentrality)

p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, Eigencentrality, fill=gene.score)) + geom_boxplot() + 
         scale_fill_manual(values=SFARI_colour_hue()) + theme_minimal() + theme(legend.position='none'))

p2 = ggplotly(plot_data %>% ggplot(aes(Syndromic, Eigencentrality, fill=Syndromic)) + geom_boxplot() + 
     theme_minimal() + theme(legend.position='none') + 
     ggtitle(paste0('Correlation = ', round(correlation_scr, 2), ' (left) and ', round(correlation_syn,2), ' (right)')))

subplot(p1, p2, nrows=1, widths=c(0.7, 0.3))
rm(correlation_scr, correlation_syn, p1, p2)

When separating the SFARI scores by syndromic tag, the relation between eigencentrality and score persists

corr_non_syn = cor(nodes$gene.score[!is.na(nodes$gene.score) & nodes$syndromic==0], 
                   eigencentr[!is.na(nodes$gene.score) & nodes$syndromic==0])
corr_syn = cor(nodes$gene.score[!is.na(nodes$gene.score) & nodes$syndromic==1], 
               eigencentr[!is.na(nodes$gene.score) & nodes$syndromic==1])
ggplotly(plot_data %>% ggplot(aes(gene.score, Eigencentrality, fill=gene.score)) + geom_boxplot() + 
         scale_fill_manual(values=SFARI_colour_hue()) + theme_minimal() + theme(legend.position='none') +
         facet_wrap( ~ Syndromic, ncol=2) + ggtitle(paste0('Correlation = ', round(corr_non_syn, 4),
                                                           ' (left) and ', round(corr_syn, 4), ' (right)')))
rm(plot_data, corr_non_syn, corr_syn, nodes, edges)
plot_data = data.frame('id'=graph %>% get.vertex.attribute('name'), 'eigencentrality'=eigencentr) %>%
            top_n(25, wt=eigencentrality)
ggplotly(plot_data %>% ggplot(aes(reorder(id, eigencentrality), eigencentrality, fill=eigencentrality)) + 
         geom_bar(stat='identity') + scale_fill_viridis_c() + coord_flip() + 
         ggtitle(paste0('Top 25 genes with the highest Network centrality')) + 
         xlab('') + ylab('Eigencentrality') + theme_minimal() + theme(legend.position='none'))
rm(plot_data)

Note: Network is too densely connected to plot (too heavy and not very informative)

Community detection:

comms_louvain = cluster_louvain(graph)
modules = comms_louvain %>% membership

color_pal = viridis_pal()(max(modules)) %>% substr(1,7)
graph = graph %>% set_vertex_attr('module', value=as.numeric(modules)) %>% 
                  set_vertex_attr('color_modules', value=color_pal[as.numeric(modules)]) %>%
                  set_vertex_attr('color', value=color_pal[as.numeric(modules)])

comm_sizes = sizes(comms_louvain) %>% data.frame %>% rename(Module=Community.sizes, size=Freq)
ggplotly(comm_sizes %>% ggplot(aes(Module, size, fill=size)) + geom_bar(stat='identity') + 
         ggtitle('Number of genes in each module') + theme_minimal() + theme(legend.position = 'none'))
hm_data = table(vertex_attr(graph, 'gene.score'), vertex_attr(graph, 'module'), useNA='ifany') %>% 
          as.data.frame.matrix %>% as.matrix
heatmap.2(hm_data, cellnote=hm_data, notecol='white', Rowv=F, Colv=F, scale='row', trace='none', 
          key=FALSE, xlab='Module', ylab='SFARI score', col=colorRampPalette(brewer.pal(8, 'RdPu'))(25))

hm_data = table(vertex_attr(graph, 'syndromic'), vertex_attr(graph, 'module'), useNA='ifany') %>% 
          as.data.frame.matrix %>% as.matrix
heatmap.2(hm_data, cellnote=hm_data, notecol='white', Rowv=F, Colv=F, scale='row', trace='none', 
          key=FALSE, xlab='Module', ylab='Syndromic', col=colorRampPalette(brewer.pal(8, 'RdPu'))(25))

rm(comm_sizes)

Separating by syndromic tag the SFARI scores

hm_data = data.frame('syndromic_gene.score'=paste(vertex_attr(graph, 'syndromic'), '_',vertex_attr(graph, 'gene.score')),
                     'module'=vertex_attr(graph, 'module'))
hm_data = table(hm_data$syndromic_gene.score, hm_data$module) %>% as.data.frame.matrix %>% as.matrix
heatmap.2(hm_data, cellnote=hm_data, notecol='white', scale='row', trace='none', 
          key=FALSE, xlab='Module', ylab='Syndromic', col=colorRampPalette(brewer.pal(8, 'RdPu'))(25))

rm(color_pal, hm_data)

Module characterisation

Most important genes for each module

module_info = tibble('module'=character(), 'top_term'=character(), size=integer())

for(i in names(sizes(comms_louvain))){
  
  module_vertices = names(modules)[modules==i]
  
  # Creat subgraph
  module_graph = induced_subgraph(graph, module_vertices)
  
  # Calculate eigencentrality
  eigen_centrality_output = eigen_centrality(module_graph)
  eigencentr = as.numeric(eigen_centrality_output$vector)
  names(eigencentr) = module_vertices
  
  module_info = module_info %>% add_row('module'=i, 'top_term'=names(eigencentr)[eigencentr==max(eigencentr)][1], 
                                        'size'=length(module_vertices))
    
  # Plot eigencentrality of top 10 genes
  plot_data = data.frame('gene'=names(eigencentr), 'eigencentrality'=eigencentr) %>%
              top_n(10, wt=eigencentrality)
  print(plot_data %>% ggplot(aes(reorder(gene, eigencentrality), eigencentrality, fill=eigencentrality)) + 
       geom_bar(stat='identity') + scale_fill_viridis_c() + coord_flip() + 
       ggtitle(paste0('Top genes for Module ', i)) + 
       xlab('Genes') + ylab('Eigencentrality') + theme_minimal() + theme(legend.position='none'))

}

rm(i, module_vertices, module_graph, eigen_centrality_output, eigencentr, plot_data, module_info)

Most important MeSH term modules for each module

MeSH term top modules interpretations (kind of):

  • Module 2: Neuroscience

  • Module 3: Gene sequencing

  • Module 7: TF/Proteins

  • Module 9: DNA

  • Module 10: TF/Proteins

  • Module 14: Proteins

  • Module 19: GTP-Binding/Proteins

Module 1: Neuroscience

Module 2: Proteins? Only Module 7 is in the top modules with description

Module 3: Same as Module 2

for(i in names(sizes(comms_louvain))){
  
  module_genes = names(modules)[modules==i]
  
  # Filter genes belonging to module and column with non-zero values
  module_gene_module_mat = gene_module_mat[rownames(gene_module_mat) %in% module_genes,]
  module_gene_module_mat = module_gene_module_mat[,colSums(module_gene_module_mat)>0]
  
  # Do PCA and extract 1st PC
  pca_module = prcomp(module_gene_module_mat, scale.=TRUE)
  module_eigengene = pca_module$rotation[,'PC1']
  names(module_eigengene) = colnames(module_gene_module_mat)
  
  # Plot module's most relevant MeSH terms
  plot_data = data.frame('module'=names(module_eigengene), 'eigencentrality'=module_eigengene) %>% 
              mutate('module'=factor(module, levels=c('X1','X2','X3','X4','X5','X6','X7','X8','X9','X10',
                                                      'X11','X12','X13','X14','X15'), ordered=TRUE))
  print(plot_data %>% ggplot(aes(module, eigencentrality, fill=abs(eigencentrality))) + 
       geom_bar(position='identity', stat='identity') + scale_fill_viridis_c() + coord_flip() + 
       ggtitle(paste0('MeSH Term Module Membership for Module ', i)) + 
       xlab('MeSH Term Module') + ylab('Eigencentrality') + theme_minimal() + theme(legend.position='none'))
}

rm(i, modules, module_genes, module_gene_module_mat, pca_module, module_eigengene, plot_data)